library(rvest)
library(dplyr)
library(ggplot2)
library(tidyr)
library(corrplot)
library(ape)
mod<-function(x,m){
t1<-floor(x/m)
return(x-t1*m)
}
#read in webpage
url<-"https://bulbapedia.bulbagarden.net/wiki/List_of_Pok%C3%A9mon_by_National_Pok%C3%A9dex_number#Generation_III"
pg<-read_html(url)
table_nodes<-pg %>% html_nodes("table")
#only read table nodes 2 through 8 to get list of all pokemon
table_nodes<-table_nodes[2:8]
html_table(table_nodes[1],fill=TRUE)[[1]]
create_frames<-function(node){
#get structure
structure = html_table(node,fill=TRUE)[[1]]
df = data.frame(structure[,c(2,4,5,6)])
return(df)
}
#loop over table nodes to create complete table of pokemone
region_names<-c("Kanto", "Johto", "Hoenn", "Sinnoh", "Unova", "Kalos", "Alola")
list_frames<- list()
for (i in 1:length(table_nodes)){
df<-create_frames(table_nodes[i])
#create region list
dim1<- dim(df)[1]
region<- rep(region_names[i],dim1)
df$Region<-region
list_frames[[i]]<-df
}
#create pokedex, but need to clean up
pokedex<-do.call(rbind,list_frames)
pokedex_names<-c('Ndex','Name','Primary_Type','Secondary_Type','Region')
colnames(pokedex)<-pokedex_names
pokedex<- pokedex %>%
arrange(Name)
#drop duplicate entries
pokedex<-pokedex[!duplicated(pokedex[c("Name")]),]
pokedex
##################
#for each name in the pokedex go to its page and grab the stats and move tables
#this takes a while to run
pokemon_names<-pokedex$Name
summary_tables<-list()
for (i in 1:length(pokemon_names)){
#case when two words
if (pokemon_names[i] == "Nidoran♂"){
pokemon_names[i] = "nidoran-m"
}
if (pokemon_names[i] == "Nidoran♀"){
pokemon_names[i] = "nidoran-f"
}
if (pokemon_names[i] == "Mr. Mime"){
pokemon_names[i] = "mr-mime"
}
if (pokemon_names[i] == "Mime Jr."){
pokemon_names[i] = "mime-jr"
}
if (pokemon_names[i] == "Type: Null"){
pokemon_names[i] = "type-null"
}
if (pokemon_names[i] == "Flabébé"){
pokemon_names[i] = "flabebe"
}
if (pokemon_names[i] == "Tapu Koko"){
pokemon_names[i] = "Tapu-Koko"
}
if (pokemon_names[i] == "Tapu Lele"){
pokemon_names[i] = "Tapu-LeLe"
}
if (pokemon_names[i] == "Tapu Bulu"){
pokemon_names[i] = "Tapu-Bulu"
}
if (pokemon_names[i] == "Tapu Fini"){
pokemon_names[i] = "Tapu-Fini"
}
#makelowercase
lowercase<-tolower(pokemon_names[i])
#create url
url<-paste0("https://pokemondb.net/pokedex/",lowercase)
#get table nodes
table_nodes<-read_html(url) %>%
html_nodes("table")
#use the fourth node
table<-html_table(table_nodes[4])[[1]]
#add pokemon name as column
dim<-dim(table)[1]
name<-rep(pokemon_names[i],dim)
table$Name<-name
#add to list summary tables
summary_tables[[i]]<-table
Sys.sleep(0.4)
if (mod(i,50) == 0){
print(i)
print(Sys.time())
}
}
#need to clean
pokemon_stats<-do.call(rbind,summary_tables)
#create base states df
base_stats<-pokemon_stats[,c("X1","X2","Name")]
base_stats<-spread(base_stats, X1,X2)
base_stats$Name<-pokedex$Name
base_names<-c("Name","Attack","Defense","HP","Sp_Attack","Sp_Defense","Speed","Total")
colnames(base_stats)<-base_names
#create min stats frame
min_stats<-pokemon_stats[,c("X1","X4","Name")]
min_stats<-spread(min_stats,X1,X4)
min_stats<-subset(min_stats,select=-c(Total))
min_names<-c("Name", "Min_Attack", "Min_Defense","Min_HP",
"Min_Sp_Attack","Min_Sp_Defense", "Min_Speed")
colnames(min_stats)<-min_names
min_stats$Name<-pokedex$Name
#create max stats
max_stats<-pokemon_stats[,c("X1","X5","Name")]
max_stats<-spread(max_stats,X1,X5)
max_stats<-subset(max_stats,select=-c(Total))
max_names<-c("Name", "Max_Attack", "Max_Defense","Max_HP",
"Max_Sp_Attack","Max_Sp_Defense", "Max_Speed")
colnames(max_stats)<-max_names
max_stats$Name<-pokedex$Name
#merge all 4 together
a<-merge(pokedex,base_stats,by='Name')
b<-merge(a,min_stats,by='Name')
final<-merge(b,max_stats,by='Name')
#adjust dtype
final$Min_Attack<-as.integer(final$Min_Attack)
final$Min_Defense<-as.integer(final$Min_Defense)
final$Min_HP<-as.integer(final$Min_HP)
final$Min_Sp_Attack<-as.integer(final$Min_Sp_Attack)
final$Min_Sp_Defense<-as.integer(final$Min_Sp_Defense)
final$Min_Speed<-as.integer(final$Min_Speed)
final$Max_Attack<-as.integer(final$Max_Attack)
final$Max_Defense<-as.integer(final$Max_Defense)
final$Max_HP<-as.integer(final$Max_HP)
final$Max_Sp_Attack<-as.integer(final$Max_Sp_Attack)
final$Max_Sp_Defense<-as.integer(final$Max_Sp_Defense)
final$Max_Speed<-as.integer(final$Max_Speed)
dir<-"/Users/janmichaelaustria/Google Drive/UNH Fall Expansion/UNH Fall 2019/DATA900 Webscraping"
setwd(dir)
write.csv(final,"/Users/janmichaelaustria/Google Drive/UNH Fall Expansion/UNH Fall 2019/DATA900 Webscraping/pokemon.csv", row.names = FALSE)
#import final
dir<-"/Users/janmichaelaustria/Google Drive/UNH Fall Expansion/UNH Fall 2019/DATA900 Webscraping"
setwd(dir)
final <- read.csv(file="pokemon.csv", header=TRUE, sep=",")
colnames(final)[6:24]
options(repr.plot.width=4, repr.plot.height=4)
final %>%
group_by(Region) %>%
select(c(colnames(final)[6:24])) %>%
summarise_all(funs(mean)) %>%
ggplot(aes(x=Region, y=Attack,fill=Region)) +
geom_bar(stat="identity")
options(repr.plot.width=14, repr.plot.height=7)
plot_multi_histogram <- function(df, feature, label_column) {
plt <- ggplot(df, aes(x=eval(parse(text=feature)), fill=eval(parse(text=label_column)))) +
geom_histogram(alpha=0.7, position="identity", aes(y = ..density..), color="black") +
geom_density(alpha=0.4) +
geom_vline(aes(xintercept=mean(eval(parse(text=feature)))), color="black", linetype="dashed", size=1) +
labs(x=feature, y = "Density")
plt + guides(fill=guide_legend(title=label_column))
}
plot_multi_histogram(final,"Total","Primary_Type") + facet_wrap( ~ Region, ncol=3)
options(repr.plot.width=11, repr.plot.height=11)
ggplot(final,aes(x=Defense, y=Attack)) +
geom_point(aes(colour = factor(Primary_Type),size=Total),alpha=0.6) +
geom_text(data=subset(final, Attack > 130 | Defense > 130),aes(label=Name)) +
geom_text(data=subset(final, HP > 210),aes(label=Name)) +
geom_smooth(method = 'lm',se = TRUE,color='red')
options(repr.plot.width=20, repr.plot.height=50)
options(warn=-1)
ggplot(final, aes(y=as.integer(Attack), x=factor(Primary_Type), fill=factor(Primary_Type))) +
geom_boxplot() +
facet_wrap( ~ Region, ncol=1) +
geom_text(data=subset(final, Attack > 100 | Attack < 50),aes(label=Name),size = 7)
#coord_flip()
options(repr.plot.width=30, repr.plot.height=70)
options(warn=-1)
final %>%
filter(Region %in% c("Kanto",'Johto')) %>%
ggplot(aes(x=factor(Primary_Type), y = as.integer(Attack),fill=factor(Region))) +
geom_boxplot() +
coord_flip() +
facet_wrap(~Region,ncol = 2) +
geom_text(aes(label=Name,colour=Primary_Type,size=Attack),fontface = "bold",
position=position_jitter(width=.7,height=.5))+
scale_fill_manual(values = c("Kanto" = "grey",
"Johto" = "white"))+
scale_size(range = c(5, 12)) +
theme(axis.text.x= element_text(size =40)) +
theme(axis.text.y= element_text(size =40)) +
theme(axis.title.x = element_text(size = 40),
axis.title.y = element_text(size = 40)) +
theme(strip.text.x = element_text(size = 40))
options(repr.plot.width=7, repr.plot.height=7)
num.cols <- sapply(final, is.numeric)
cor.data <- cor(final[,num.cols])
corrPLOT<-corrplot(cor.data,method='ellipse')
options(repr.plot.width=20, repr.plot.height=20)
options(warn=-1)
final_K_J<- final %>%
filter(Region %in% c("Kanto","Johto"))
numeric_columns<-subset(final_K_J, select = -c(Name,Ndex,
Primary_Type,
Secondary_Type,
Region))
hc.complete=hclust(dist(numeric_columns), method="complete")
hc.complete$labels<-final_K_J$Name
hcd <- as.dendrogram(hc.complete)
hcd_phylo<-as.phylo(hc.complete)
hcd_phylo$tip.label<-as.character(final_K_J$Name)
colors = c("red", "blue", "green", "black","pink","magenta")
clus6 = cutree(hc.complete, 6)
plot(hcd_phylo, type = "fan", tip.color = colors[clus6],
label.offset = 1, cex = 0.7)
numeric_columns<-subset(final, select = -c(Name,Ndex,
Primary_Type,
Secondary_Type,
Region))
pc_scores<-pr.out$x
pr.out<-prcomp(numeric_columns, scale=TRUE)
#add first and second to PC scores to final
final$PC1<-pc_scores[,1]
final$PC2<-pc_scores[,2]
#creat arrors for featurs in frist two PCS
pc_arrows<-pr.out$rotation
pc_arrows<-pc_arrows[,c(1,2)]
options(repr.plot.width=10, repr.plot.height=7)
#Bi plots
p<-final %>%
ggplot(aes(x=PC1,y=PC2)) +
geom_point(aes(colour=Region,size=Attack),alpha=.7) +
geom_text(data=subset(final, Attack > 130 | Defense > 180),aes(label=Name))
#geom_text(aes(label=Name))
p + geom_segment(mapping=aes(x=0, y=0, xend=pc_arrows[1,1]*40, yend=pc_arrows[1,2]*40),
arrow=arrow(), size=1, color="red",alpha=0.5) +
geom_text(x=pc_arrows[1,1]*40, y=pc_arrows[1,2]*40, label=rownames(pc_arrows)[1], size=5,alpha=.2) +
geom_segment(mapping=aes(x=0, y=0, xend=pc_arrows[2,1]*40, yend=pc_arrows[2,2]*40),
arrow=arrow(), size=1, color="red",alpha=0.5) +
geom_text(x=pc_arrows[2,1]*40, y=pc_arrows[2,2]*40, label=rownames(pc_arrows)[2], size=5,alpha=.2) +
geom_segment(mapping=aes(x=0, y=0, xend=pc_arrows[3,1]*40, yend=pc_arrows[3,2]*40),
arrow=arrow(), size=1, color="red",alpha=0.5) +
geom_text(x=pc_arrows[3,1]*40, y=pc_arrows[3,2]*40, label=rownames(pc_arrows)[3], size=5,alpha=.2) +
geom_segment(mapping=aes(x=0, y=0, xend=pc_arrows[6,1]*40, yend=pc_arrows[6,2]*40),
arrow=arrow(), size=1, color="red",alpha=0.5) +
geom_text(x=pc_arrows[6,1]*40, y=pc_arrows[6,2]*40, label=rownames(pc_arrows)[6], size=5,alpha=.2)
options(repr.plot.width=10, repr.plot.height=7)
#denstiy biplots
final%>%
ggplot(aes(x=PC1, y=PC2) ) +
geom_hex(bins=30) +
scale_fill_continuous(type = "viridis",alpha=.8) +
theme_bw()
#lets try HDBSCAN, onlny on the first two PCS
options(warn=-1)
library(dbscan)
cl_obj<-hdbscan(subset(final,select = c(PC1,PC2)),minPts = 3)
plot(cl_obj$hc)
#put the labels back in
final$hdbscan_labels<-cl_obj$cluster
#put the labels back in
final$hdbscan_labels<-cl_obj$cluster
final %>%
ggplot(aes(x=PC1,y=PC2)) +
geom_point(aes(size=Attack,colour=factor(hdbscan_labels)),alpha=0.7) +
geom_text(data=subset(final, Attack > 135 | Defense > 180),aes(label=Name))
options(repr.plot.width=20, repr.plot.height=12)
final %>%
#filter(Region=='Sinnoh') %>%
ggplot(aes(x=factor(hdbscan_labels),y=Attack,)) +
geom_boxplot() +
geom_text(data=subset(final, Attack > 135 | Defense > 180),aes(label=Name,colour=Primary_Type,size=Attack),fontface = "bold",
position=position_jitter(width=.7,height=.5)) +
coord_flip()
#now try only clustering group 5
final_cluster_5<-final[final$hdbscan_labels == 5,]
final_cluster_5 %>%
ggplot(aes(x=PC1,PC2)) +
geom_point(aes(colour=Region,size=Attack),alpha=0.7) +
geom_text(data=subset(final_cluster_5, Attack < 150 & Attack > 120),aes(label=Name))
cl_obj_5<-hdbscan(subset(final_cluster_5,select = c(PC1,PC2)),minPts =10)
final_cluster_5$hdbscan_labels<-cl_obj_5$cluster
#check clusters
final_cluster_5 %>%
ggplot(aes(x=PC1,PC2)) +
geom_point(aes(colour=factor(hdbscan_labels),size=Total),alpha=0.7)
options(repr.plot.width=15, repr.plot.height=10)
final_cluster_5 %>%
filter(hdbscan_labels != 0) %>%
ggplot(aes(y=as.integer(Attack), x=factor(hdbscan_labels),fill=factor(hdbscan_labels))) +
geom_boxplot() +
geom_text(data=subset(final_cluster_5,Attack > 120),aes(label=Name)) +
geom_text(data=subset(final_cluster_5,Attack < 100 & Attack > 70),aes(label=Name))
#the summary stats indicate the avg stats by region are different than one another
#if we tie the avg stats to each pokemon stats maybe the classification by region will be better
library(e1071)
X<-pr.out$x
y<-final$Region
dat<-data.frame(x=X,y=as.factor(y))
svmfit=svm(y~., data=dat, kernel="linear", cost=0.1, scale=FALSE)
set.seed (1)
tune.out<-tune(svm,y~.,data=dat,kernel="linear",ranges =list(cost=c(0.001, 0.01, 0.1, 1,5,10,100)))
summary(tune.out)
best_mod_svm_lin<-tune.out$best.model
svm_lin_accuracy<-0.7317593
svmfit=svm(y~., data=dat, kernel="linear", cost=0.1, scale=FALSE)
set.seed (1)
tune.out<-tune(svm,y~.,data=dat,kernel="radial",
ranges =list(cost=c(0.001, 0.01, 0.1, 1,5,10,100)),
gamma=c(0.5,1,2,3,4))
summary(tune.out)
svm_rbf_accuracy<-0.6971605
library(tidyverse)
library(caret)
library(nnet)
log_reg <- multinom(y~., data = data_frame(X))
predicted.classes <- log_reg %>% predict(data_frame(X))
log_reg_acc<-mean(predicted.classes == y)
log_reg_acc
library(randomForest)
set.seed(1)
bag.model<-randomForest(y~.,data=X,mtry=6,importance =TRUE,ntree=55)
bag.model
random_forest_accuracy<-0.5896
library(gbm)
set.seed(1)
boost.model<-gbm(y~.,data=data_frame(X),distribution= "multinomial",
n.trees=5000, interaction.depth=4,shrinkage =0.2, verbose=F)
boost.model
predictions_boosted<-predict(boost.model,newdata = data_frame(X),n.trees=5000,type="response")
p.predBST <- apply(predictions_boosted, 1, which.max)
for (i in 1:length(y)){
if (p.predBST[i] == 1){
p.predBST[i] = 'Alola'
}
if (p.predBST[i] == 2){
p.predBST[i] = "Hoenn"
}
if (p.predBST[i] == 3){
p.predBST[i] = "Johto"
}
if (p.predBST[i] == 4){
p.predBST[i] = "Kalos"
}
if (p.predBST[i] == 5){
p.predBST[i] = "Kanto"
}
if (p.predBST[i] == 6){
p.predBST[i] = "Sinnoh"
}
if (p.predBST[i] == 7){
p.predBST[i] = "Unova"
}
}
mean(y == factor(p.predBST))
gradient_boosted_tree_accuracy<-mean(y == factor(p.predBST))
svm_lin_accuracy<-0.7317
svm_rbf_accuracy<-0.6971
log_reg_accuracy<-0.3127
random_forest_accuracy<-0.5896
grad_boost_accuracy<-0.9740
values<-rbind(0.7317,0.6971,0.3127,0.5896,0.9740)
names<-c('svm_lin_accuracy','svm_rbf_accuracy','log_reg_accuracy',
'random_forest_accuracy','grad_boost_accuracy')
summary<-cbind(names,values)
(data_frame(x=values,y=names))
!jupyter nbconvert --to html Webscraping_Pokemon.ipynb